#import sys
#!{sys.executable} -m pip install pandas-profiling
#Libraries to help with reading data and manipulating data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Libraries that support data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Libraries to suppress warnings
import warnings
warnings.filterwarnings("ignore")
sns.set()
# to split the data into train and test
from sklearn.model_selection import train_test_split
# to build linear regression_model
from sklearn.linear_model import LinearRegression
# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# to build linear regression_model using statsmodels
import statsmodels.api as sm
#Load the dataset
data = pd.read_csv("used_phone_data.csv")
#copying data to another data frame to avaoid changes in the original data
df = data.copy()
df.head() # display 5 rows from the dataframe
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 23.97 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 111.62 | 86.96 |
| 1 | Honor | Android | 28.10 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 249.39 | 161.49 |
| 2 | Honor | Android | 24.29 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 359.47 | 268.55 |
| 3 | Honor | Android | 26.04 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 278.93 | 180.23 |
| 4 | Honor | Android | 15.72 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 140.87 | 103.80 |
df.shape
(3571, 15)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3571 entries, 0 to 3570 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand_name 3571 non-null object 1 os 3571 non-null object 2 screen_size 3571 non-null float64 3 4g 3571 non-null object 4 5g 3571 non-null object 5 main_camera_mp 3391 non-null float64 6 selfie_camera_mp 3569 non-null float64 7 int_memory 3561 non-null float64 8 ram 3561 non-null float64 9 battery 3565 non-null float64 10 weight 3564 non-null float64 11 release_year 3571 non-null int64 12 days_used 3571 non-null int64 13 new_price 3571 non-null float64 14 used_price 3571 non-null float64 dtypes: float64(9), int64(2), object(4) memory usage: 418.6+ KB
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| screen_size | 3571.0 | 14.803892 | 5.153092 | 2.700 | 12.700 | 13.49 | 16.510 | 46.36 |
| main_camera_mp | 3391.0 | 9.400454 | 4.818396 | 0.080 | 5.000 | 8.00 | 13.000 | 48.00 |
| selfie_camera_mp | 3569.0 | 6.547352 | 6.879359 | 0.300 | 2.000 | 5.00 | 8.000 | 32.00 |
| int_memory | 3561.0 | 54.532607 | 84.696246 | 0.005 | 16.000 | 32.00 | 64.000 | 1024.00 |
| ram | 3561.0 | 4.056962 | 1.391844 | 0.030 | 4.000 | 4.00 | 4.000 | 16.00 |
| battery | 3565.0 | 3067.225666 | 1364.206665 | 80.000 | 2100.000 | 3000.00 | 4000.000 | 12000.00 |
| weight | 3564.0 | 179.424285 | 90.280856 | 23.000 | 140.000 | 159.00 | 184.000 | 950.00 |
| release_year | 3571.0 | 2015.964996 | 2.291784 | 2013.000 | 2014.000 | 2016.00 | 2018.000 | 2020.00 |
| days_used | 3571.0 | 675.391487 | 248.640972 | 91.000 | 536.000 | 690.00 | 872.000 | 1094.00 |
| new_price | 3571.0 | 237.389037 | 197.545581 | 9.130 | 120.130 | 189.80 | 291.935 | 2560.20 |
| used_price | 3571.0 | 109.880277 | 121.501226 | 2.510 | 45.205 | 75.53 | 126.000 | 1916.54 |
By default the describe() function shows only the summary of numeric variables only. Let's check the summary of non-numeric variables.
df.describe(exclude='number').T
| count | unique | top | freq | |
|---|---|---|---|---|
| brand_name | 3571 | 34 | Others | 509 |
| os | 3571 | 4 | Android | 3246 |
| 4g | 3571 | 2 | yes | 2359 |
| 5g | 3571 | 2 | no | 3419 |
Let's check the count of each unique category in each of the categorical variables.
category = ['brand_name', 'os', '4g', '5g']
for column in category:
print(df[column].value_counts())
print('_'*40)
Others 509 Samsung 364 Huawei 264 LG 212 Lenovo 172 ZTE 141 Xiaomi 134 Oppo 129 Asus 126 Alcatel 125 Nokia 121 Micromax 120 Honor 118 Vivo 117 HTC 110 Motorola 110 Sony 88 Meizu 62 Apple 59 Gionee 56 Acer 51 XOLO 49 Panasonic 47 Realme 41 Celkon 37 Lava 36 Karbonn 30 Spice 30 OnePlus 22 BlackBerry 22 Coolpad 22 Microsoft 22 Google 15 Infinix 10 Name: brand_name, dtype: int64 ________________________________________ Android 3246 Others 202 Windows 67 iOS 56 Name: os, dtype: int64 ________________________________________ yes 2359 no 1212 Name: 4g, dtype: int64 ________________________________________ no 3419 yes 152 Name: 5g, dtype: int64 ________________________________________
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3571 entries, 0 to 3570 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand_name 3571 non-null object 1 os 3571 non-null object 2 screen_size 3571 non-null float64 3 4g 3571 non-null object 4 5g 3571 non-null object 5 main_camera_mp 3391 non-null float64 6 selfie_camera_mp 3569 non-null float64 7 int_memory 3561 non-null float64 8 ram 3561 non-null float64 9 battery 3565 non-null float64 10 weight 3564 non-null float64 11 release_year 3571 non-null int64 12 days_used 3571 non-null int64 13 new_price 3571 non-null float64 14 used_price 3571 non-null float64 dtypes: float64(9), int64(2), object(4) memory usage: 418.6+ KB
print(df.brand_name.unique())
['Honor' 'Others' 'HTC' 'Huawei' 'Infinix' 'Lava' 'Lenovo' 'LG' 'Meizu' 'Micromax' 'Motorola' 'Nokia' 'OnePlus' 'Oppo' 'Realme' 'Samsung' 'Vivo' 'Xiaomi' 'ZTE' 'Apple' 'Asus' 'Coolpad' 'Acer' 'Alcatel' 'BlackBerry' 'Celkon' 'Gionee' 'Google' 'Karbonn' 'Microsoft' 'Panasonic' 'Sony' 'Spice' 'XOLO']
print(df.os.unique())
['Android' 'Others' 'iOS' 'Windows']
print(df['4g'].unique())
['yes' 'no']
print(df['5g'].unique())
['no' 'yes']
df.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| brand_name | 3571 | 34 | Others | 509 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| os | 3571 | 4 | Android | 3246 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| screen_size | 3571.0 | NaN | NaN | NaN | 14.803892 | 5.153092 | 2.7 | 12.7 | 13.49 | 16.51 | 46.36 |
| 4g | 3571 | 2 | yes | 2359 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5g | 3571 | 2 | no | 3419 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| main_camera_mp | 3391.0 | NaN | NaN | NaN | 9.400454 | 4.818396 | 0.08 | 5.0 | 8.0 | 13.0 | 48.0 |
| selfie_camera_mp | 3569.0 | NaN | NaN | NaN | 6.547352 | 6.879359 | 0.3 | 2.0 | 5.0 | 8.0 | 32.0 |
| int_memory | 3561.0 | NaN | NaN | NaN | 54.532607 | 84.696246 | 0.005 | 16.0 | 32.0 | 64.0 | 1024.0 |
| ram | 3561.0 | NaN | NaN | NaN | 4.056962 | 1.391844 | 0.03 | 4.0 | 4.0 | 4.0 | 16.0 |
| battery | 3565.0 | NaN | NaN | NaN | 3067.225666 | 1364.206665 | 80.0 | 2100.0 | 3000.0 | 4000.0 | 12000.0 |
| weight | 3564.0 | NaN | NaN | NaN | 179.424285 | 90.280856 | 23.0 | 140.0 | 159.0 | 184.0 | 950.0 |
| release_year | 3571.0 | NaN | NaN | NaN | 2015.964996 | 2.291784 | 2013.0 | 2014.0 | 2016.0 | 2018.0 | 2020.0 |
| days_used | 3571.0 | NaN | NaN | NaN | 675.391487 | 248.640972 | 91.0 | 536.0 | 690.0 | 872.0 | 1094.0 |
| new_price | 3571.0 | NaN | NaN | NaN | 237.389037 | 197.545581 | 9.13 | 120.13 | 189.8 | 291.935 | 2560.2 |
| used_price | 3571.0 | NaN | NaN | NaN | 109.880277 | 121.501226 | 2.51 | 45.205 | 75.53 | 126.0 | 1916.54 |
df.isna().sum()
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 180 selfie_camera_mp 2 int_memory 10 ram 10 battery 6 weight 7 release_year 0 days_used 0 new_price 0 used_price 0 dtype: int64
df.isnull().sum()
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 180 selfie_camera_mp 2 int_memory 10 ram 10 battery 6 weight 7 release_year 0 days_used 0 new_price 0 used_price 0 dtype: int64
df['os'] = df['os'].astype('category')
df['brand_name'] = df['brand_name'].astype('category')
df['5g'] = df['5g'].astype('category')
df['4g'] = df['4g'].astype('category')
sns.histplot(df.screen_size, kde=True);
sns.boxplot(df.screen_size,orient = "h");
sns.histplot(df.main_camera_mp, kde=True);
sns.boxplot(df.main_camera_mp,orient = "h");
sns.histplot(df.selfie_camera_mp, kde=True);
sns.histplot(df.int_memory, kde=True,binwidth=50);
sns.boxplot(df.int_memory,orient = "h");
sns.histplot(df.ram, kde=True);
sns.boxplot(df.ram,orient = "h");
sns.histplot(df.battery, kde=True);
sns.boxplot(df.battery,orient = "h");
sns.histplot(df.weight, kde=True, binwidth=50);
sns.boxplot(df.weight,orient = "h");
sns.histplot(df.release_year, kde=True);
sns.boxplot(df.release_year,orient = "h");
sns.histplot(df.days_used, kde=True);
sns.boxplot(df.days_used,orient = "h");
There are phones which are used for 600 - 1000 days
sns.histplot(df.new_price, kde=True, binwidth=50);
sns.boxplot(df.new_price,orient = "h");
The graph is right skewed. There are many phones ~100 -150 Euros price range. There are also high priced phones
sns.histplot(df.used_price, kde=True, binwidth=50);
sns.boxplot(df.used_price,orient = "h");
sns.histplot(df.os);
sns.histplot(df['4g']);
Many phones have 4g
sns.histplot(df['5g']);
Most of the used_phones donot have 5g
sns.heatmap(df.corr(),cmap="YlGnBu",vmin=-1,vmax=1, annot=True)
<AxesSubplot:>
sns.pairplot(df, hue='os');
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='release_year', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='days_used', y = df['used_price'], data = df)
Most of the used_phones fall below 250 Euros price range
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='new_price', y = df['used_price'], data = df)
The used_price and new_price are linearly related.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='ram', y = df['used_price'], data = df)
Most of the used_phones have Ram between 4-12GB
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='int_memory', y = df['used_price'], data = df)
int_memory and used_price are linearly related.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='weight', y = df['used_price'], data = df)
Most of the used_phones are below 200 grams and their price range is below 500 Euros.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='battery', y = df['used_price'], data = df)
Most of the used phones have 4000mAh battery. There are also phones with 10000mAh.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='main_camera_mp', y = df['used_price'], data = df)
main_camera_mp and used_price are linearly related. Most of the used phones have 8-14GB RAM.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='selfie_camera_mp', y = df['used_price'], data = df)
selfie_camera_mp and used_price are linearly related.
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='screen_size', y = df['used_price'], data = df)
The screen_size of the highest price phone is ~20cm. The screen size of the used_phones are below 35 cms. There are phones whose screen_size is 44cm
sns.countplot(x="os", hue="brand_name", data=df, palette='Set1',saturation=50 );
sns.countplot(x="os", hue="4g", data=df, palette='Set1',saturation=50 );
sns.countplot(x="os", hue="5g", data=df, palette='Set1',saturation=50 );
sns.countplot(x="5g", hue="brand_name", data=df, palette='Set1',saturation=50 );
Most of the used_phones have no 5g
sns.countplot(x="5g", hue="4g", data=df, palette='Set1',saturation=50 );
Very few phones have both 4g and 5g
plt.figure(figsize=(15,5))
ax = sns.barplot(x='os', y='used_price', data=data)
plt.figure(figsize=(15,5))
ax = sns.barplot(x='brand_name', y='used_price', data=data)
plt.figure(figsize=(15,5))
ax = sns.barplot(x='4g', y='used_price', data=data)
plt.figure(figsize=(15,5))
ax = sns.barplot(x='5g', y='used_price', data=data)
The used_phones with 5g enables have high used_price.
#figure = plt.figure(figsize=(8,7))
#sns.barplot(x="brand_name", y="used_price", data=df, hue='os', palette='tab10' )
#plt.show()
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "os", "used_price", order=["Android", "Others", "iOS", "Windows"])
<seaborn.axisgrid.FacetGrid at 0x1ff50840310>
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "4g", "used_price", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff4b970d30>
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "5g", "used_price", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff489b1f70>
1.Most of the Huawei, LG, Meizu, Motorola, Nokia, oneplusone,Oppo, Samsung, Vivo, Xiaomi, ZTE company phones have 5g are highly priced
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "4g", "ram", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff50affc10>
There seems to be not much relation between RAM and 4g. Though among some brands, that have 4g have more RAM.
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "5g", "ram", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff580f0af0>
Though among some brands, that have 5g reguire more RAM.
g = sns.FacetGrid(df, col="brand_name", height=4, aspect=.5)
g.map(sns.barplot, "os", "ram", order=["Android", "Others", "iOS", "Windows"])
<seaborn.axisgrid.FacetGrid at 0x1ff4bbe0d00>
ANdroid phones seems to be having more RAM followed by Others then Windows
Questions:
sns.histplot(df.used_price, kde=True, binwidth=50);
Maximum used_price is ~2000 Euros. There are many phones used used price is below 200 Euros. There distribution is right skewed and there are many outliers.
sns.histplot(df.os);
def labeled_barplot(data, feature, perc=False, n=None):
total = len(data[feature]) # length of the column
count = data[feature].nunique()
if n is None:
plt.figure(figsize=(count + 1, 5))
else:
plt.figure(figsize=(n + 1, 5))
plt.xticks(rotation=90, fontsize=15)
ax = sns.countplot(
data=data,
x=feature,
palette="Paired",
order=data[feature].value_counts().index[:n].sort_values(),
)
for p in ax.patches:
if perc == True:
label = "{:.1f}%".format(
100 * p.get_height() / total
)
else:
label = p.get_height() # count of each level of the category
x = p.get_x() + p.get_width() / 2 # width of the plot
y = p.get_height() # height of the plot
ax.annotate(
label,
(x, y),
ha="center",
va="center",
size=12,
xytext=(0, 5),
textcoords="offset points",
)
plt.show()
labeled_barplot(df, "os", perc=True)
About 90.9% of the used_phones are dominated by Android devices
g = sns.FacetGrid(df, col="brand_name")
g.map(sns.histplot, "ram")
#plt.figure(figsize=(20,5))
#ax = sns.jointplot(x='brand_name', y='ram', data=data)
<seaborn.axisgrid.FacetGrid at 0x1ff59291310>
df["heavybat"] = np.where(df["battery"] > 4500, "heavybat", "lightbat")
#fig = plt.figure(figsize= (10,5))
#ax = sns.regplot(x ='heavybat', y = df['weight'], data = df)
df
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | heavybat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 23.97 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 111.6200 | 86.96 | lightbat |
| 1 | Honor | Android | 28.10 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 249.3900 | 161.49 | lightbat |
| 2 | Honor | Android | 24.29 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 359.4700 | 268.55 | lightbat |
| 3 | Honor | Android | 26.04 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 278.9300 | 180.23 | heavybat |
| 4 | Honor | Android | 15.72 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 140.8700 | 103.80 | heavybat |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3566 | Asus | Android | 15.88 | yes | no | NaN | 8.0 | 64.0 | 6.0 | 5000.0 | 190.0 | 2019 | 232 | 654.5000 | 490.96 | heavybat |
| 3567 | Asus | Android | 15.24 | yes | no | 13.0 | 8.0 | 128.0 | 8.0 | 4000.0 | 200.0 | 2018 | 541 | 518.8145 | 259.30 | lightbat |
| 3568 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 3.0 | 4000.0 | 165.0 | 2020 | 201 | 92.6500 | 69.81 | lightbat |
| 3569 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 2.0 | 4000.0 | 160.0 | 2020 | 149 | 101.9150 | 76.07 | lightbat |
| 3570 | Alcatel | Android | 13.49 | yes | no | 13.0 | 5.0 | 16.0 | 2.0 | 4000.0 | 168.0 | 2020 | 176 | 72.2415 | 50.48 | lightbat |
3571 rows × 16 columns
plt.figure(figsize=(15,5))
ax = sns.barplot(x=df['heavybat'], y='weight', data=data)
df["screen_grp"] = np.where(df["screen_size"] > 6, "larger_screen", "smaller_screen")
df["screen_grp"].value_counts()
larger_screen 3450 smaller_screen 121 Name: screen_grp, dtype: int64
sns.histplot(df.screen_grp);
There are 3450 used_phones that have scree_size greater than 6 inches.
temp = df[df['selfie_camera_mp'] > 8].copy()
temp
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | heavybat | screen_grp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Honor | Android | 28.10 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 249.3900 | 161.49 | lightbat | larger_screen |
| 8 | Honor | Android | 15.72 | yes | no | 13.0 | 16.0 | 128.0 | 6.0 | 4000.0 | 165.0 | 2020 | 161 | 200.3200 | 150.88 | lightbat | larger_screen |
| 12 | Honor | Android | 24.29 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4100.0 | 206.0 | 2019 | 537 | 499.6800 | 249.81 | lightbat | larger_screen |
| 13 | Honor | Others | 8.73 | no | no | 13.0 | 16.0 | 4.0 | 4.0 | 455.0 | 41.0 | 2019 | 432 | 179.6200 | 89.77 | lightbat | larger_screen |
| 14 | Honor | Android | 15.72 | yes | no | 13.0 | 16.0 | 64.0 | 4.0 | 4000.0 | 171.5 | 2019 | 336 | 221.1300 | 144.87 | lightbat | larger_screen |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3555 | Samsung | Android | 15.88 | yes | no | 12.0 | 32.0 | 64.0 | 4.0 | 4000.0 | 169.0 | 2019 | 315 | 231.1915 | 149.38 | lightbat | larger_screen |
| 3556 | Samsung | Android | 16.51 | yes | no | 12.0 | 10.0 | 256.0 | 12.0 | 4300.0 | 196.0 | 2019 | 489 | 671.4915 | 335.83 | lightbat | larger_screen |
| 3557 | Samsung | Android | 15.72 | yes | no | 12.0 | 10.0 | 256.0 | 8.0 | 3500.0 | 168.0 | 2019 | 429 | 518.5000 | 259.17 | lightbat | larger_screen |
| 3558 | Samsung | Others | 3.18 | yes | no | 12.0 | 9.0 | 4.0 | 1.5 | 340.0 | 42.0 | 2019 | 524 | 240.6435 | 120.38 | lightbat | smaller_screen |
| 3565 | Asus | Android | 24.61 | yes | no | NaN | 24.0 | 128.0 | 8.0 | 6000.0 | 240.0 | 2019 | 325 | 1163.6500 | 756.99 | heavybat | larger_screen |
666 rows × 17 columns
sns.histplot(temp.selfie_camera_mp);
sns.heatmap(df.corr(),cmap="YlGnBu",vmin=-1,vmax=1, annot=True)
<AxesSubplot:>
df = df.drop(['heavybat','screen_grp'],axis =1)
np.random.seed(42)
df.sample(n=10)#Return a random sample of 10 rows from the dataframe 'data'
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 457 | Acer | Android | 10.95 | yes | no | 5.0 | 2.0 | 32.0 | 4.0 | 2000.0 | 145.0 | 2015 | 1044 | 129.00 | 38.77 |
| 1647 | Lenovo | Android | 25.56 | yes | no | 8.0 | 1.6 | 16.0 | 4.0 | 9600.0 | 619.0 | 2014 | 723 | 300.61 | 120.29 |
| 351 | Oppo | Android | 23.97 | yes | yes | NaN | 32.0 | 128.0 | 8.0 | 4000.0 | 172.0 | 2020 | 293 | 470.81 | 335.09 |
| 1667 | Lenovo | Android | 12.70 | no | no | 13.0 | 5.0 | 16.0 | 4.0 | 2000.0 | 140.0 | 2014 | 721 | 199.93 | 80.14 |
| 1849 | LG | Android | 10.64 | no | no | 8.0 | 0.3 | 16.0 | 4.0 | 2460.0 | 115.5 | 2013 | 644 | 150.71 | 60.18 |
| 2111 | Nokia | Android | 23.97 | yes | no | 5.0 | 8.0 | 32.0 | 4.0 | 3000.0 | 161.0 | 2019 | 492 | 149.04 | 74.56 |
| 1366 | Huawei | Android | 12.70 | yes | no | 13.0 | 5.0 | 16.0 | 4.0 | 2200.0 | 140.0 | 2016 | 1069 | 130.90 | 39.27 |
| 70 | Lenovo | Android | 20.32 | yes | no | 13.0 | 5.0 | 16.0 | 2.0 | 5100.0 | 305.0 | 2019 | 461 | 158.56 | 79.14 |
| 1206 | HTC | Android | 13.81 | yes | no | 13.0 | 13.0 | 64.0 | 4.0 | 3075.0 | 150.0 | 2018 | 687 | 200.26 | 80.33 |
| 1213 | HTC | Android | 13.02 | yes | no | 16.0 | 16.0 | 64.0 | 4.0 | 2600.0 | 142.0 | 2017 | 859 | 210.26 | 63.02 |
df.isnull().sum().sort_values(ascending = False)
#Return the count of missing values column-wise and sort them in descending order
main_camera_mp 180 int_memory 10 ram 10 weight 7 battery 6 selfie_camera_mp 2 brand_name 0 os 0 screen_size 0 4g 0 5g 0 release_year 0 days_used 0 new_price 0 used_price 0 dtype: int64
For the target variable(used_price) we will drop the missing values. For the predictor variables, we will replace missing values in each column with its median.
df.dropna(subset = ['used_price'], inplace = True)
df
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 23.97 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 111.6200 | 86.96 |
| 1 | Honor | Android | 28.10 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 249.3900 | 161.49 |
| 2 | Honor | Android | 24.29 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 359.4700 | 268.55 |
| 3 | Honor | Android | 26.04 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 278.9300 | 180.23 |
| 4 | Honor | Android | 15.72 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 140.8700 | 103.80 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3566 | Asus | Android | 15.88 | yes | no | NaN | 8.0 | 64.0 | 6.0 | 5000.0 | 190.0 | 2019 | 232 | 654.5000 | 490.96 |
| 3567 | Asus | Android | 15.24 | yes | no | 13.0 | 8.0 | 128.0 | 8.0 | 4000.0 | 200.0 | 2018 | 541 | 518.8145 | 259.30 |
| 3568 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 3.0 | 4000.0 | 165.0 | 2020 | 201 | 92.6500 | 69.81 |
| 3569 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 2.0 | 4000.0 | 160.0 | 2020 | 149 | 101.9150 | 76.07 |
| 3570 | Alcatel | Android | 13.49 | yes | no | 13.0 | 5.0 | 16.0 | 2.0 | 4000.0 | 168.0 | 2020 | 176 | 72.2415 | 50.48 |
3571 rows × 15 columns
medianFiller = lambda x: x.fillna(x.median())
numeric_columns = df.select_dtypes(include = np.number).columns.tolist()
df[numeric_columns] = df[numeric_columns].apply(medianFiller, axis = 0)
df.isnull().sum()
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 0 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 new_price 0 used_price 0 dtype: int64
Missing values have been treated.
# Lets look at the statistical summary of the data
df.describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| brand_name | 3571 | 34 | Others | 509 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| os | 3571 | 4 | Android | 3246 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| screen_size | 3571.0 | NaN | NaN | NaN | 14.803892 | 5.153092 | 2.7 | 12.7 | 13.49 | 16.51 | 46.36 |
| 4g | 3571 | 2 | yes | 2359 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5g | 3571 | 2 | no | 3419 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| main_camera_mp | 3571.0 | NaN | NaN | NaN | 9.329863 | 4.705341 | 0.08 | 5.0 | 8.0 | 13.0 | 48.0 |
| selfie_camera_mp | 3571.0 | NaN | NaN | NaN | 6.546486 | 6.877529 | 0.3 | 2.0 | 5.0 | 8.0 | 32.0 |
| int_memory | 3571.0 | NaN | NaN | NaN | 54.469509 | 84.585924 | 0.005 | 16.0 | 32.0 | 64.0 | 1024.0 |
| ram | 3571.0 | NaN | NaN | NaN | 4.056802 | 1.389897 | 0.03 | 4.0 | 4.0 | 4.0 | 16.0 |
| battery | 3571.0 | NaN | NaN | NaN | 3067.112714 | 1363.062572 | 80.0 | 2100.0 | 3000.0 | 4000.0 | 12000.0 |
| weight | 3571.0 | NaN | NaN | NaN | 179.384248 | 90.196828 | 23.0 | 140.0 | 159.0 | 184.0 | 950.0 |
| release_year | 3571.0 | NaN | NaN | NaN | 2015.964996 | 2.291784 | 2013.0 | 2014.0 | 2016.0 | 2018.0 | 2020.0 |
| days_used | 3571.0 | NaN | NaN | NaN | 675.391487 | 248.640972 | 91.0 | 536.0 | 690.0 | 872.0 | 1094.0 |
| new_price | 3571.0 | NaN | NaN | NaN | 237.389037 | 197.545581 | 9.13 | 120.13 | 189.8 | 291.935 | 2560.2 |
| used_price | 3571.0 | NaN | NaN | NaN | 109.880277 | 121.501226 | 2.51 | 45.205 | 75.53 | 126.0 | 1916.54 |
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (12,7))
kde: whether to the show density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows=2, # Number of rows of the subplot grid= 2
sharex=True, # x-axis will be shared among all subplots
gridspec_kw={"height_ratios": (0.25, 0.75)},
figsize=figsize,
) # creating the 2 subplots
sns.boxplot(
data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
) # boxplot will be created and a star will indicate the mean value of the column
sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
) if bins else sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color="green", linestyle="--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color="black", linestyle="-"
) # Add median to the histogram
histogram_boxplot(df,"used_price")
Brand_name column has 34 unique variables. Lets group all brand names based on new_price
df['brand_type'] = pd.cut(data['new_price'], bins = [0,500,1500,3000], labels = ['Low', 'Medium', 'High'])
df
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | brand_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 23.97 | yes | no | 13.0 | 5.0 | 64.0 | 3.0 | 3020.0 | 146.0 | 2020 | 127 | 111.6200 | 86.96 | Low |
| 1 | Honor | Android | 28.10 | yes | yes | 13.0 | 16.0 | 128.0 | 8.0 | 4300.0 | 213.0 | 2020 | 325 | 249.3900 | 161.49 | Low |
| 2 | Honor | Android | 24.29 | yes | yes | 13.0 | 8.0 | 128.0 | 8.0 | 4200.0 | 213.0 | 2020 | 162 | 359.4700 | 268.55 | Low |
| 3 | Honor | Android | 26.04 | yes | yes | 13.0 | 8.0 | 64.0 | 6.0 | 7250.0 | 480.0 | 2020 | 345 | 278.9300 | 180.23 | Low |
| 4 | Honor | Android | 15.72 | yes | no | 13.0 | 8.0 | 64.0 | 3.0 | 5000.0 | 185.0 | 2020 | 293 | 140.8700 | 103.80 | Low |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3566 | Asus | Android | 15.88 | yes | no | 8.0 | 8.0 | 64.0 | 6.0 | 5000.0 | 190.0 | 2019 | 232 | 654.5000 | 490.96 | Medium |
| 3567 | Asus | Android | 15.24 | yes | no | 13.0 | 8.0 | 128.0 | 8.0 | 4000.0 | 200.0 | 2018 | 541 | 518.8145 | 259.30 | Medium |
| 3568 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 3.0 | 4000.0 | 165.0 | 2020 | 201 | 92.6500 | 69.81 | Low |
| 3569 | Alcatel | Android | 18.73 | yes | no | 13.0 | 5.0 | 32.0 | 2.0 | 4000.0 | 160.0 | 2020 | 149 | 101.9150 | 76.07 | Low |
| 3570 | Alcatel | Android | 13.49 | yes | no | 13.0 | 5.0 | 16.0 | 2.0 | 4000.0 | 168.0 | 2020 | 176 | 72.2415 | 50.48 | Low |
3571 rows × 16 columns
df['brand_type'].value_counts()
Low 3325 Medium 237 High 9 Name: brand_type, dtype: int64
df['brand_type'].unique()
['Low', 'Medium', 'High'] Categories (3, object): ['Low' < 'Medium' < 'High']
df.isnull().sum() # checking for null-values
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 0 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 new_price 0 used_price 0 brand_type 0 dtype: int64
sns.histplot(df["brand_type"])
<AxesSubplot:xlabel='brand_type', ylabel='Count'>
#let's plot the boxplots of all columns to check for outliers
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numeric_columns):
plt.subplot(5, 4, i + 1)
plt.boxplot(df[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
# Outlier treatment
def treatment_outliers(df, col):
Q1 = df[col].quantile(0.25) # 25th quantile
Q3 = df[col].quantile(0.75) # 75th quantile
IQR = Q3 - Q1
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
# all the values smaller than Lower_Whisker will be assigned the value of Lower_Whisker
# all the values greater than Upper_Whisker will be assigned the value of Upper_Whisker
df[col] = np.clip(df[col], Lower_Whisker, Upper_Whisker)
return df
def treat_all_outliers(df, col_list):
for c in col_list:
df = treatment_outliers(df, c)
return df
# treating the outliers
numerical_col = df.select_dtypes(include=np.number).columns.tolist()
df = treat_all_outliers(df, numerical_col)
# let's look at the boxplots to see if the outliers have been treated or not
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numeric_columns):
plt.subplot(5, 4, i + 1)
plt.boxplot(df[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
df.head()
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | brand_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Honor | Android | 22.225 | yes | no | 13.0 | 5.0 | 64.0 | 4.0 | 3020.0 | 146.0 | 2020 | 127 | 111.62 | 86.9600 | Low |
| 1 | Honor | Android | 22.225 | yes | yes | 13.0 | 16.0 | 128.0 | 4.0 | 4300.0 | 213.0 | 2020 | 325 | 249.39 | 161.4900 | Low |
| 2 | Honor | Android | 22.225 | yes | yes | 13.0 | 8.0 | 128.0 | 4.0 | 4200.0 | 213.0 | 2020 | 162 | 359.47 | 247.1925 | Low |
| 3 | Honor | Android | 22.225 | yes | yes | 13.0 | 8.0 | 64.0 | 4.0 | 6850.0 | 250.0 | 2020 | 345 | 278.93 | 180.2300 | Low |
| 4 | Honor | Android | 15.720 | yes | no | 13.0 | 8.0 | 64.0 | 4.0 | 5000.0 | 185.0 | 2020 | 293 | 140.87 | 103.8000 | Low |
df.shape
(3571, 16)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3571 entries, 0 to 3570 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand_name 3571 non-null category 1 os 3571 non-null category 2 screen_size 3571 non-null float64 3 4g 3571 non-null category 4 5g 3571 non-null category 5 main_camera_mp 3571 non-null float64 6 selfie_camera_mp 3571 non-null float64 7 int_memory 3571 non-null float64 8 ram 3571 non-null float64 9 battery 3571 non-null float64 10 weight 3571 non-null float64 11 release_year 3571 non-null int64 12 days_used 3571 non-null int64 13 new_price 3571 non-null float64 14 used_price 3571 non-null float64 15 brand_type 3571 non-null category dtypes: category(5), float64(9), int64(2) memory usage: 354.1 KB
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| screen_size | 3571.0 | 14.521803 | 4.165771 | 6.985 | 12.700 | 13.49 | 16.510 | 22.2250 |
| main_camera_mp | 3571.0 | 9.299619 | 4.530650 | 0.080 | 5.000 | 8.00 | 13.000 | 25.0000 |
| selfie_camera_mp | 3571.0 | 5.972417 | 5.287271 | 0.300 | 2.000 | 5.00 | 8.000 | 17.0000 |
| int_memory | 3571.0 | 44.583202 | 38.938853 | 0.005 | 16.000 | 32.00 | 64.000 | 136.0000 |
| ram | 3571.0 | 4.000000 | 0.000000 | 4.000 | 4.000 | 4.00 | 4.000 | 4.0000 |
| battery | 3571.0 | 3042.193083 | 1272.573404 | 80.000 | 2100.000 | 3000.00 | 4000.000 | 6850.0000 |
| weight | 3571.0 | 164.415584 | 41.098253 | 74.000 | 140.000 | 159.00 | 184.000 | 250.0000 |
| release_year | 3571.0 | 2015.964996 | 2.291784 | 2013.000 | 2014.000 | 2016.00 | 2018.000 | 2020.0000 |
| days_used | 3571.0 | 675.391487 | 248.640972 | 91.000 | 536.000 | 690.00 | 872.000 | 1094.0000 |
| new_price | 3571.0 | 221.841506 | 135.411699 | 9.130 | 120.130 | 189.80 | 291.935 | 549.6425 |
| used_price | 3571.0 | 95.528957 | 66.145611 | 2.510 | 45.205 | 75.53 | 126.000 | 247.1925 |
df.describe(exclude='number').T
| count | unique | top | freq | |
|---|---|---|---|---|
| brand_name | 3571 | 34 | Others | 509 |
| os | 3571 | 4 | Android | 3246 |
| 4g | 3571 | 2 | yes | 2359 |
| 5g | 3571 | 2 | no | 3419 |
| brand_type | 3571 | 3 | Low | 3325 |
category = ['brand_name', 'os', '4g', '5g','brand_type']
for column in category:
print(df[column].value_counts())
print('_'*40)
Others 509 Samsung 364 Huawei 264 LG 212 Lenovo 172 ZTE 141 Xiaomi 134 Oppo 129 Asus 126 Alcatel 125 Nokia 121 Micromax 120 Honor 118 Vivo 117 HTC 110 Motorola 110 Sony 88 Meizu 62 Apple 59 Gionee 56 Acer 51 XOLO 49 Panasonic 47 Realme 41 Celkon 37 Lava 36 Karbonn 30 Spice 30 Microsoft 22 OnePlus 22 Coolpad 22 BlackBerry 22 Google 15 Infinix 10 Name: brand_name, dtype: int64 ________________________________________ Android 3246 Others 202 Windows 67 iOS 56 Name: os, dtype: int64 ________________________________________ yes 2359 no 1212 Name: 4g, dtype: int64 ________________________________________ no 3419 yes 152 Name: 5g, dtype: int64 ________________________________________ Low 3325 Medium 237 High 9 Name: brand_type, dtype: int64 ________________________________________
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3571 entries, 0 to 3570 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand_name 3571 non-null category 1 os 3571 non-null category 2 screen_size 3571 non-null float64 3 4g 3571 non-null category 4 5g 3571 non-null category 5 main_camera_mp 3571 non-null float64 6 selfie_camera_mp 3571 non-null float64 7 int_memory 3571 non-null float64 8 ram 3571 non-null float64 9 battery 3571 non-null float64 10 weight 3571 non-null float64 11 release_year 3571 non-null int64 12 days_used 3571 non-null int64 13 new_price 3571 non-null float64 14 used_price 3571 non-null float64 15 brand_type 3571 non-null category dtypes: category(5), float64(9), int64(2) memory usage: 354.1 KB
print(df.brand_name.unique())
['Honor', 'Others', 'HTC', 'Huawei', 'Infinix', ..., 'Microsoft', 'Panasonic', 'Sony', 'Spice', 'XOLO'] Length: 34 Categories (34, object): ['Honor', 'Others', 'HTC', 'Huawei', ..., 'Panasonic', 'Sony', 'Spice', 'XOLO']
print(df.os.unique())
['Android', 'Others', 'iOS', 'Windows'] Categories (4, object): ['Android', 'Others', 'iOS', 'Windows']
print(df['4g'].unique())
['yes', 'no'] Categories (2, object): ['yes', 'no']
print(df['5g'].unique())
['no', 'yes'] Categories (2, object): ['no', 'yes']
print(df['brand_type'].unique())
['Low', 'Medium', 'High'] Categories (3, object): ['Low' < 'Medium' < 'High']
df.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| brand_name | 3571 | 34 | Others | 509 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| os | 3571 | 4 | Android | 3246 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| screen_size | 3571.0 | NaN | NaN | NaN | 14.521803 | 4.165771 | 6.985 | 12.7 | 13.49 | 16.51 | 22.225 |
| 4g | 3571 | 2 | yes | 2359 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5g | 3571 | 2 | no | 3419 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| main_camera_mp | 3571.0 | NaN | NaN | NaN | 9.299619 | 4.53065 | 0.08 | 5.0 | 8.0 | 13.0 | 25.0 |
| selfie_camera_mp | 3571.0 | NaN | NaN | NaN | 5.972417 | 5.287271 | 0.3 | 2.0 | 5.0 | 8.0 | 17.0 |
| int_memory | 3571.0 | NaN | NaN | NaN | 44.583202 | 38.938853 | 0.005 | 16.0 | 32.0 | 64.0 | 136.0 |
| ram | 3571.0 | NaN | NaN | NaN | 4.0 | 0.0 | 4.0 | 4.0 | 4.0 | 4.0 | 4.0 |
| battery | 3571.0 | NaN | NaN | NaN | 3042.193083 | 1272.573404 | 80.0 | 2100.0 | 3000.0 | 4000.0 | 6850.0 |
| weight | 3571.0 | NaN | NaN | NaN | 164.415584 | 41.098253 | 74.0 | 140.0 | 159.0 | 184.0 | 250.0 |
| release_year | 3571.0 | NaN | NaN | NaN | 2015.964996 | 2.291784 | 2013.0 | 2014.0 | 2016.0 | 2018.0 | 2020.0 |
| days_used | 3571.0 | NaN | NaN | NaN | 675.391487 | 248.640972 | 91.0 | 536.0 | 690.0 | 872.0 | 1094.0 |
| new_price | 3571.0 | NaN | NaN | NaN | 221.841506 | 135.411699 | 9.13 | 120.13 | 189.8 | 291.935 | 549.6425 |
| used_price | 3571.0 | NaN | NaN | NaN | 95.528957 | 66.145611 | 2.51 | 45.205 | 75.53 | 126.0 | 247.1925 |
| brand_type | 3571 | 3 | Low | 3325 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df.isna().sum()
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 0 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 new_price 0 used_price 0 brand_type 0 dtype: int64
df.isnull().sum()
brand_name 0 os 0 screen_size 0 4g 0 5g 0 main_camera_mp 0 selfie_camera_mp 0 int_memory 0 ram 0 battery 0 weight 0 release_year 0 days_used 0 new_price 0 used_price 0 brand_type 0 dtype: int64
df['os'] = df['os'].astype('category')
df['brand_name'] = df['brand_name'].astype('category')
df['5g'] = df['5g'].astype('category')
df['4g'] = df['4g'].astype('category')
df['brand_type'] = df['brand_type'].astype('category')
sns.histplot(df.screen_size, kde=True);
sns.boxplot(df.screen_size,orient = "h");
sns.histplot(df.main_camera_mp, kde=True);
sns.boxplot(df.main_camera_mp,orient = "h");
sns.histplot(df.selfie_camera_mp, kde=True);
sns.histplot(df.int_memory, kde=True,binwidth=50);
sns.histplot(df.ram, kde=True);
sns.histplot(df.battery, kde=True);
sns.histplot(df.weight, kde=True, binwidth=50);
sns.histplot(df.release_year, kde=True);
sns.histplot(df.days_used, kde=True);
sns.histplot(df.new_price, kde=True, binwidth=50);
sns.histplot(df.used_price, kde=True, binwidth=50);
sns.histplot(df.os);
sns.histplot(df['4g']);
sns.histplot(df['5g']);
sns.histplot(df.brand_type);
sns.heatmap(df.corr(),cmap="YlGnBu",vmin=-1,vmax=1, annot=True)
<AxesSubplot:>
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='release_year', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='days_used', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='new_price', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='ram', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='int_memory', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='weight', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='battery', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='main_camera_mp', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='selfie_camera_mp', y = df['used_price'], data = df)
fig = plt.figure(figsize= (10,5))
ax = sns.regplot(x ='screen_size', y = df['used_price'], data = df)
sns.countplot(x="os", hue="brand_type", data=df, palette='Set1',saturation=50 );
sns.countplot(x="os", hue="5g", data=df, palette='Set1',saturation=50 );
Observations Very few Android based phones have 5g ALl the other, Windows and iOS os based phones have no 5g
sns.countplot(x="5g", hue="brand_type", data=df, palette='Set1',saturation=50 );
There are many low brand_type phones that donot have 5g. High end phones donot have 5g
sns.countplot(x="5g", hue="4g", data=df, palette='Set1',saturation=50 );
Very few phones have both 4g and 5g
plt.figure(figsize=(15,5))
ax = sns.barplot(x='os', y='used_price', data=data)
iOS based used_phones have high used_price. Other os based and windows based used_phones have low used_price
plt.figure(figsize=(15,5))
ax = sns.barplot(x='4g', y='used_price', data=data)
The used_phones with 4g enables have high used_price
plt.figure(figsize=(15,5))
ax = sns.barplot(x='5g', y='used_price', data=data)
The used_phones with 5g enables have high used_price.
#figure = plt.figure(figsize=(8,7))
#sns.barplot(x="brand_name", y="used_price", data=df, hue='os', palette='tab10' )
#plt.show()
g = sns.FacetGrid(df, col="brand_type", height=4, aspect=.5)
g.map(sns.barplot, "os", "used_price", order=["Android", "Others", "iOS", "Windows"])
<seaborn.axisgrid.FacetGrid at 0x1ff6682aeb0>
g = sns.FacetGrid(df, col="brand_type", height=4, aspect=.5)
g.map(sns.barplot, "4g", "used_price", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff66a43d00>
g = sns.FacetGrid(df, col="brand_type", height=4, aspect=.5)
g.map(sns.barplot, "5g", "used_price", order=["yes", "no"])
<seaborn.axisgrid.FacetGrid at 0x1ff66b32610>
sns.histplot(df.used_price, kde=True, binwidth=50);
Maximum used_price is ~250 Euros. There are many phones used used price is below 100 Euros. There distribution is right skewed.
sns.histplot(df.os);
labeled_barplot(df, "os", perc=True)
About 90.9% of the used_phones are dominated by Android devices
g = sns.FacetGrid(df, col="brand_name")
g.map(sns.histplot, "ram")
<seaborn.axisgrid.FacetGrid at 0x1ff66372790>
RAM seems to be discrete and constant in nature. All the phone have range between 3.5GB and 4.5GB Others brand have maximum number of used_phones.
df["heavybat"] = np.where(df["battery"] > 4500, "heavybat", "lightbat")
plt.figure(figsize=(15,5))
ax = sns.barplot(x=df['heavybat'], y='weight', data=data)
Heavy Battery phones seems to be heavy than the light battery phones.
df["screen_grp"] = np.where(df["screen_size"] > 6, "larger_screen", "smaller_screen")
df["screen_grp"].value_counts()
larger_screen 3571 Name: screen_grp, dtype: int64
sns.histplot(df.screen_grp);
Now the number of big screens phones larger than 6 inches have increased. There are 3571 used_phones that have scree_size greater than 6 inches.
temp = df[df['selfie_camera_mp'] > 8].copy()
temp
| brand_name | os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | ram | battery | weight | release_year | days_used | new_price | used_price | brand_type | heavybat | screen_grp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Honor | Android | 22.225 | yes | yes | 13.0 | 16.0 | 128.0 | 4.0 | 4300.0 | 213.0 | 2020 | 325 | 249.3900 | 161.4900 | Low | lightbat | larger_screen |
| 8 | Honor | Android | 15.720 | yes | no | 13.0 | 16.0 | 128.0 | 4.0 | 4000.0 | 165.0 | 2020 | 161 | 200.3200 | 150.8800 | Low | lightbat | larger_screen |
| 12 | Honor | Android | 22.225 | yes | yes | 13.0 | 16.0 | 128.0 | 4.0 | 4100.0 | 206.0 | 2019 | 537 | 499.6800 | 247.1925 | Low | lightbat | larger_screen |
| 13 | Honor | Others | 8.730 | no | no | 13.0 | 16.0 | 4.0 | 4.0 | 455.0 | 74.0 | 2019 | 432 | 179.6200 | 89.7700 | Low | lightbat | larger_screen |
| 14 | Honor | Android | 15.720 | yes | no | 13.0 | 16.0 | 64.0 | 4.0 | 4000.0 | 171.5 | 2019 | 336 | 221.1300 | 144.8700 | Low | lightbat | larger_screen |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3555 | Samsung | Android | 15.880 | yes | no | 12.0 | 17.0 | 64.0 | 4.0 | 4000.0 | 169.0 | 2019 | 315 | 231.1915 | 149.3800 | Low | lightbat | larger_screen |
| 3556 | Samsung | Android | 16.510 | yes | no | 12.0 | 10.0 | 136.0 | 4.0 | 4300.0 | 196.0 | 2019 | 489 | 549.6425 | 247.1925 | Medium | lightbat | larger_screen |
| 3557 | Samsung | Android | 15.720 | yes | no | 12.0 | 10.0 | 136.0 | 4.0 | 3500.0 | 168.0 | 2019 | 429 | 518.5000 | 247.1925 | Medium | lightbat | larger_screen |
| 3558 | Samsung | Others | 6.985 | yes | no | 12.0 | 9.0 | 4.0 | 4.0 | 340.0 | 74.0 | 2019 | 524 | 240.6435 | 120.3800 | Low | lightbat | larger_screen |
| 3565 | Asus | Android | 22.225 | yes | no | 8.0 | 17.0 | 128.0 | 4.0 | 6000.0 | 240.0 | 2019 | 325 | 549.6425 | 247.1925 | Medium | heavybat | larger_screen |
666 rows × 18 columns
sns.histplot(temp.selfie_camera_mp);
sns.heatmap(df.corr(),cmap="YlGnBu",vmin=-1,vmax=1, annot=True)
<AxesSubplot:>
df = df.drop(['heavybat','screen_grp'],axis =1)
# defining X and y variables.
# ram variable is highly skewed. For most of the phones it is 4GB. Hence, dropping the column
X = df.drop(['used_price','brand_name', 'ram'], axis = 1) #ram
y = df['used_price']
X.head()
| os | screen_size | 4g | 5g | main_camera_mp | selfie_camera_mp | int_memory | battery | weight | release_year | days_used | new_price | brand_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Android | 22.225 | yes | no | 13.0 | 5.0 | 64.0 | 3020.0 | 146.0 | 2020 | 127 | 111.62 | Low |
| 1 | Android | 22.225 | yes | yes | 13.0 | 16.0 | 128.0 | 4300.0 | 213.0 | 2020 | 325 | 249.39 | Low |
| 2 | Android | 22.225 | yes | yes | 13.0 | 8.0 | 128.0 | 4200.0 | 213.0 | 2020 | 162 | 359.47 | Low |
| 3 | Android | 22.225 | yes | yes | 13.0 | 8.0 | 64.0 | 6850.0 | 250.0 | 2020 | 345 | 278.93 | Low |
| 4 | Android | 15.720 | yes | no | 13.0 | 8.0 | 64.0 | 5000.0 | 185.0 | 2020 | 293 | 140.87 | Low |
y.head()
0 86.9600 1 161.4900 2 247.1925 3 180.2300 4 103.8000 Name: used_price, dtype: float64
X = pd.get_dummies(X, columns = X.select_dtypes(include = ["object","category"]).columns.tolist(), drop_first = True)
X.head()
| screen_size | main_camera_mp | selfie_camera_mp | int_memory | battery | weight | release_year | days_used | new_price | os_Others | os_Windows | os_iOS | 4g_yes | 5g_yes | brand_type_Medium | brand_type_High | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22.225 | 13.0 | 5.0 | 64.0 | 3020.0 | 146.0 | 2020 | 127 | 111.62 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 22.225 | 13.0 | 16.0 | 128.0 | 4300.0 | 213.0 | 2020 | 325 | 249.39 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 2 | 22.225 | 13.0 | 8.0 | 128.0 | 4200.0 | 213.0 | 2020 | 162 | 359.47 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 3 | 22.225 | 13.0 | 8.0 | 64.0 | 6850.0 | 250.0 | 2020 | 345 | 278.93 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 4 | 15.720 | 13.0 | 8.0 | 64.0 | 5000.0 | 185.0 | 2020 | 293 | 140.87 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
y.head()
0 86.9600 1 161.4900 2 247.1925 3 180.2300 4 103.8000 Name: used_price, dtype: float64
# Splitting the data into test and train data
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
x_train_ols = x_train.copy()
y_train_ols = y_train.copy()
x_test_ols = x_test.copy()
y_test_ols = y_test.copy()
x_train.shape[0]
2499
x_test.shape[0]
1072
There are 2499 rows in train data and 1072 rows in test data
# fit the model on the train data
from sklearn.linear_model import LinearRegression
linearregression = LinearRegression()
linearregression.fit(x_train, y_train)
LinearRegression()
coef_df = pd.DataFrame(np.append(linearregression.coef_, linearregression.intercept_), index = x_train.columns.tolist()+ ["Intercept"], columns = ["Coefficients"])
coef_df
| Coefficients | |
|---|---|
| screen_size | 0.094370 |
| main_camera_mp | -0.309414 |
| selfie_camera_mp | 0.616280 |
| int_memory | 0.094922 |
| battery | -0.000129 |
| weight | -0.005222 |
| release_year | 0.186794 |
| days_used | -0.085353 |
| new_price | 0.405613 |
| os_Others | -4.789019 |
| os_Windows | -1.233302 |
| os_iOS | 8.658309 |
| 4g_yes | -3.792003 |
| 5g_yes | 2.710973 |
| brand_type_Medium | -14.382445 |
| brand_type_High | -26.377297 |
| Intercept | -314.959332 |
# Adjusted r-square
def adj_r2(predictors, targets, predictions):
r2 = r2_score(targets, predictions)
n = predictors.shape[0]
k = predictors.shape[1]
return 1-((1-r2)*(n-1)/(n-k-1))
def mape_score(targets, predictions):
return np.mean(np.abs(targets -predictions)/targets) * 100
def model_perf_regression(model, predictors, target):
pred = model.predict(predictors)
r2 = r2_score(target,pred)
adjr2 = adj_r2(predictors, target, pred)
rmse = np.sqrt(mean_squared_error(target,pred))
mae = mean_absolute_error(target, pred)
mape = mape_score(target,pred)
# creating dataframe of metrics
df_performance = pd.DataFrame({
"RMSE": rmse,
"MAE": mae,
"R-Squared": r2,
"Adj. R-Squared": adjr2,
"MAPE": mape,
},
index=[0],
)
return df_performance
# check train set performance
print("Training set performance\n")
linearregression_train_performance = model_perf_regression(linearregression, x_train, y_train)
linearregression_train_performance
Training set performance
| RMSE | MAE | R-Squared | Adj. R-Squared | MAPE | |
|---|---|---|---|---|---|
| 0 | 13.980191 | 10.344831 | 0.955009 | 0.954718 | 19.115553 |
# check test set performance
print("Test Set Performance\n")
linearregression_test_performance = model_perf_regression(linearregression, x_test, y_test)
linearregression_test_performance
Test Set Performance
| RMSE | MAE | R-Squared | Adj. R-Squared | MAPE | |
|---|---|---|---|---|---|
| 0 | 13.617565 | 10.161106 | 0.95823 | 0.957597 | 16.589589 |
# Statsmodel doesnot add a constant to the data on its own. Here the constant needs to be added manually.
x_train1 = sm.add_constant(x_train) # has_constant='add'
x_test1 = sm.add_constant(x_test)
olsmod0 = sm.OLS(y_train, x_train1).fit()
print(olsmod0.summary())
OLS Regression Results
==============================================================================
Dep. Variable: used_price R-squared: 0.955
Model: OLS Adj. R-squared: 0.955
Method: Least Squares F-statistic: 3293.
Date: Fri, 20 Aug 2021 Prob (F-statistic): 0.00
Time: 22:01:53 Log-Likelihood: -10137.
No. Observations: 2499 AIC: 2.031e+04
Df Residuals: 2482 BIC: 2.041e+04
Df Model: 16
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
const -314.9593 540.308 -0.583 0.560 -1374.460 744.541
screen_size 0.0944 0.121 0.779 0.436 -0.143 0.332
main_camera_mp -0.3094 0.087 -3.561 0.000 -0.480 -0.139
selfie_camera_mp 0.6163 0.100 6.165 0.000 0.420 0.812
int_memory 0.0949 0.010 9.445 0.000 0.075 0.115
battery -0.0001 0.000 -0.305 0.760 -0.001 0.001
weight -0.0052 0.012 -0.432 0.666 -0.029 0.018
release_year 0.1868 0.268 0.697 0.486 -0.339 0.712
days_used -0.0854 0.002 -47.774 0.000 -0.089 -0.082
new_price 0.4056 0.004 106.127 0.000 0.398 0.413
os_Others -4.7890 1.355 -3.533 0.000 -7.447 -2.131
os_Windows -1.2333 2.066 -0.597 0.551 -5.285 2.819
os_iOS 8.6583 2.473 3.501 0.000 3.809 13.508
4g_yes -3.7920 0.874 -4.338 0.000 -5.506 -2.078
5g_yes 2.7110 1.693 1.601 0.109 -0.609 6.030
brand_type_Medium -14.3824 1.627 -8.842 0.000 -17.572 -11.193
brand_type_High -26.3773 6.428 -4.104 0.000 -38.981 -13.773
==============================================================================
Omnibus: 227.169 Durbin-Watson: 1.978
Prob(Omnibus): 0.000 Jarque-Bera (JB): 491.543
Skew: 0.569 Prob(JB): 1.83e-107
Kurtosis: 4.851 Cond. No. 7.39e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.39e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
Checking for the below assumptions:
No Multicollinearity
Linearity of variables
Independence of error terms
Normality of error terms
No Heteroscedasticity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def checking_vif(predictors):
vif = pd.DataFrame()
vif["feature"] = predictors.columns
vif["VIF"] = [variance_inflation_factor(predictors.values,i)
for i in range(len(predictors.columns))
]
return vif
checking_vif(x_train1)
| feature | VIF | |
|---|---|---|
| 0 | const | 3.707300e+06 |
| 1 | screen_size | 3.231782e+00 |
| 2 | main_camera_mp | 2.009739e+00 |
| 3 | selfie_camera_mp | 3.556360e+00 |
| 4 | int_memory | 1.934750e+00 |
| 5 | battery | 3.597409e+00 |
| 6 | weight | 3.070146e+00 |
| 7 | release_year | 4.818425e+00 |
| 8 | days_used | 2.548222e+00 |
| 9 | new_price | 3.377643e+00 |
| 10 | os_Others | 1.332737e+00 |
| 11 | os_Windows | 1.021507e+00 |
| 12 | os_iOS | 1.132857e+00 |
| 13 | 4g_yes | 2.194265e+00 |
| 14 | 5g_yes | 1.438091e+00 |
| 15 | brand_type_Medium | 2.025408e+00 |
| 16 | brand_type_High | 1.047651e+00 |
# initial list of columns
cols = x_train1.columns.tolist()
# setting an initial max p-value
max_p_value = 1
while len(cols) > 0:
# defining the train set
x_train_aux = x_train1[cols]
# fitting the model
model = sm.OLS(y_train, x_train_aux).fit()
# getting the p-values and the maximum p-value
p_values = model.pvalues
max_p_value = max(p_values)
# name of the variable with maximum p-value
feature_with_p_max = p_values.idxmax()
if max_p_value > 0.05:
cols.remove(feature_with_p_max)
else:
break
selected_features = cols
print(selected_features)
['main_camera_mp', 'selfie_camera_mp', 'int_memory', 'release_year', 'days_used', 'new_price', 'os_Others', 'os_iOS', '4g_yes', 'brand_type_Medium', 'brand_type_High']
x_train2 = x_train1[selected_features]
x_test2 = x_test1[selected_features]
olsmod2 = sm.OLS(y_train, x_train2).fit()
print(olsmod2.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: used_price R-squared (uncentered): 0.985
Model: OLS Adj. R-squared (uncentered): 0.985
Method: Least Squares F-statistic: 1.517e+04
Date: Fri, 20 Aug 2021 Prob (F-statistic): 0.00
Time: 22:02:06 Log-Likelihood: -10140.
No. Observations: 2499 AIC: 2.030e+04
Df Residuals: 2488 BIC: 2.037e+04
Df Model: 11
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
main_camera_mp -0.3342 0.083 -4.041 0.000 -0.496 -0.172
selfie_camera_mp 0.6697 0.088 7.621 0.000 0.497 0.842
int_memory 0.0974 0.010 9.781 0.000 0.078 0.117
release_year 0.0309 0.001 45.263 0.000 0.030 0.032
days_used -0.0865 0.001 -59.610 0.000 -0.089 -0.084
new_price 0.4059 0.003 118.789 0.000 0.399 0.413
os_Others -4.9837 1.253 -3.977 0.000 -7.441 -2.526
os_iOS 8.6099 2.421 3.557 0.000 3.863 13.356
4g_yes -3.6670 0.792 -4.629 0.000 -5.220 -2.114
brand_type_Medium -14.0870 1.580 -8.918 0.000 -17.185 -10.989
brand_type_High -25.5026 6.387 -3.993 0.000 -38.027 -12.978
==============================================================================
Omnibus: 230.585 Durbin-Watson: 1.976
Prob(Omnibus): 0.000 Jarque-Bera (JB): 492.220
Skew: 0.581 Prob(JB): 1.31e-107
Kurtosis: 4.838 Cond. No. 4.87e+04
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 4.87e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
#j=0
#for i in ('battery', 'weight', 'os_Windows', 'release_year'):
# print(i)
# x_train_j = sm.add_constant(x_train.drop(i,axis=1)) # has_constant='add'
# x_test_j = sm.add_constant(x_test.drop(i,axis=1))
# olsmod_j = sm.OLS(y_train, x_train_j).fit()
# print(olsmod_j.summary())
#j=j+1
# Statsmodel doesnot add a constant to the data on its own. Here the constant needs to be added manually.
#olsmod1 = sm.OLS(y_train, x_train1.drop("release_year", axis=1)).fit()
#print(olsmod1.summary())
#checking_vif(x_train1.drop("release_year", axis=1))
# let us create a dataframe with aactual, fitted and residual values
df_pred = pd.DataFrame()
df_pred["Actual Values"] = y_train # actual values
df_pred["Fitted Values"] = olsmod2.fittedvalues # predicted values
df_pred["Residuals"] = olsmod2.resid # residuals
df_pred.head()
| Actual Values | Fitted Values | Residuals | |
|---|---|---|---|
| 844 | 100.48 | 102.074477 | -1.594477 |
| 1539 | 111.68 | 118.123664 | -6.443664 |
| 3452 | 113.89 | 110.904820 | 2.985180 |
| 1727 | 64.09 | 69.434766 | -5.344766 |
| 1926 | 67.95 | 68.864293 | -0.914293 |
# let's plot the fitted values vs residuals
sns.residplot(
data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()
sns.histplot(data=df_pred, x="Residuals", kde=True)
plt.title("Normality of residuals")
plt.show()
The histogram of residuals does have bell shape
import pylab
import scipy.stats as stats
stats.probplot(df_pred["Residuals"], dist="norm", plot=pylab)
plt.show()
The residuals follow a straight line except tails. Check for Shapiro-Walk test
stats.shapiro(df_pred["Residuals"])
ShapiroResult(statistic=0.9706025719642639, pvalue=3.120287435108585e-22)
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(df_pred["Residuals"], x_train2)
lzip(name, test)
[('F statistic', 1.0468462382983947), ('p-value', 0.2102941600456584)]
Now, Moving on to Prediction Part.
# predictions on the test set
pred = olsmod2.predict(x_test2)
df_pred_test = pd.DataFrame({"Actual": y_test, "Predicted": pred})
df_pred_test.sample(10, random_state=1)
| Actual | Predicted | |
|---|---|---|
| 2098 | 30.5200 | 21.695159 |
| 278 | 195.6700 | 192.070230 |
| 26 | 247.1925 | 229.477471 |
| 2910 | 89.9700 | 91.721518 |
| 2631 | 69.2000 | 64.044985 |
| 1582 | 89.5800 | 109.812760 |
| 2110 | 247.1925 | 264.449155 |
| 3160 | 65.3400 | 65.208595 |
| 2817 | 115.7700 | 106.862134 |
| 549 | 39.2900 | 47.590057 |
df1 = df_pred_test.sample(25, random_state=1)
df1.plot(kind="bar", figsize=(15, 7))
plt.show()
# checking model performance on train set (seen 70% data)
print("Training Performance\n")
olsmod2_train_perf = model_perf_regression(olsmod2, x_train2, y_train)
olsmod2_train_perf
Training Performance
| RMSE | MAE | R-Squared | Adj. R-Squared | MAPE | |
|---|---|---|---|---|---|
| 0 | 13.992121 | 10.363142 | 0.954932 | 0.954732 | 19.24802 |
# checking model performance on test set (seen 30% data)
print("Test Performance\n")
olsmod2_test_perf = model_perf_regression(olsmod0, x_test1, y_test)
olsmod2_test_perf
Test Performance
| RMSE | MAE | R-Squared | Adj. R-Squared | MAPE | |
|---|---|---|---|---|---|
| 0 | 13.617565 | 10.161106 | 0.95823 | 0.957557 | 16.589589 |
# training performance comparison
models_train_comp_df = pd.concat(
[linearregression_train_performance.T, olsmod2_train_perf.T],
axis=1,
)
models_train_comp_df.columns = [
"Linear Regression sklearn",
"Linear Regression statsmodels",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Linear Regression sklearn | Linear Regression statsmodels | |
|---|---|---|
| RMSE | 13.980191 | 13.992121 |
| MAE | 10.344831 | 10.363142 |
| R-Squared | 0.955009 | 0.954932 |
| Adj. R-Squared | 0.954718 | 0.954732 |
| MAPE | 19.115553 | 19.248020 |
The performance of both the models seem to be colse to each other
# test performance comparison
models_test_comp_df = pd.concat(
[linearregression_test_performance.T, olsmod2_test_perf.T],
axis=1,
)
models_test_comp_df.columns = [
"Linear Regression sklearn",
"Linear Regression statsmodels",
]
print("Test performance comparison:")
models_test_comp_df
Test performance comparison:
| Linear Regression sklearn | Linear Regression statsmodels | |
|---|---|---|
| RMSE | 13.617565 | 13.617565 |
| MAE | 10.161106 | 10.161106 |
| R-Squared | 0.958230 | 0.958230 |
| Adj. R-Squared | 0.957597 | 0.957557 |
| MAPE | 16.589589 | 16.589589 |
Both the models seem to be similar
olsmodel_final = sm.OLS(y_train, x_train2).fit()
print(olsmodel_final.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: used_price R-squared (uncentered): 0.985
Model: OLS Adj. R-squared (uncentered): 0.985
Method: Least Squares F-statistic: 1.517e+04
Date: Fri, 20 Aug 2021 Prob (F-statistic): 0.00
Time: 22:03:25 Log-Likelihood: -10140.
No. Observations: 2499 AIC: 2.030e+04
Df Residuals: 2488 BIC: 2.037e+04
Df Model: 11
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
main_camera_mp -0.3342 0.083 -4.041 0.000 -0.496 -0.172
selfie_camera_mp 0.6697 0.088 7.621 0.000 0.497 0.842
int_memory 0.0974 0.010 9.781 0.000 0.078 0.117
release_year 0.0309 0.001 45.263 0.000 0.030 0.032
days_used -0.0865 0.001 -59.610 0.000 -0.089 -0.084
new_price 0.4059 0.003 118.789 0.000 0.399 0.413
os_Others -4.9837 1.253 -3.977 0.000 -7.441 -2.526
os_iOS 8.6099 2.421 3.557 0.000 3.863 13.356
4g_yes -3.6670 0.792 -4.629 0.000 -5.220 -2.114
brand_type_Medium -14.0870 1.580 -8.918 0.000 -17.185 -10.989
brand_type_High -25.5026 6.387 -3.993 0.000 -38.027 -12.978
==============================================================================
Omnibus: 230.585 Durbin-Watson: 1.976
Prob(Omnibus): 0.000 Jarque-Bera (JB): 492.220
Skew: 0.581 Prob(JB): 1.31e-107
Kurtosis: 4.838 Cond. No. 4.87e+04
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 4.87e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
-
The Linear regression model is able to explain 98.5% of the variance with a confidence level of 95%.
Selfie camera resolution, internal memory, release year, new price have a positive effect on Used price. Every unit increase in selfie camera resolution results in 0.67 times increase in used price. A unit increase in new price effects a 0.4 units increase in used price. An iOS device has 8.6 units increase in used price. Every unit increase in internal memory results in 0.1 units increase in used price. A unit increase in release year causes a 0.03 unit increase in used price.
Main camera resolution, days used, Operating system other than Android, iOS, Windows, 4g capability and brand type of medium/high has a negative effect on Used price. Every unit increase in Main camera resolution results in 0.33 units decrease in used price. Every unit increase in days used causes a 0.08 unit decrease in used price. A non iOS/Android/Windows device has 5 units decrease in used price. A 4g enabled device has 3.66 units less price. Brands of type Medium (I.e with New price between 500 and 1500) have 14 units decrease in used price. Brands of type High (I.e with New price between 1500 and 3000) have 26 units decrease in used price.
Following categories may not fetch much revenues:
Additionally, Given there are very less iOS phones, higher used prices can be set for such phones. But they may cater only to a niche segment. Even though Non iOS/Android/Windows phones are less, their predicted prices are also less. Hence dont invest in sales of these models. Most of the used_phones fall below 250 Euros price range. Hence this will be sector to concentrate for mass sales. Higher prices can be set for phones wiht higher selfie camera resolution which seems to be the latest trend. Some incentives / discounts can be given on phones with just higher main camera resolution as prices are not inclined towards such phones.